How to do it?:

Submission: Submit the GitHub link for the assignment to Canvas.


  1. Write the following function. Give examples to test your function.

Hint: Similar function

# Recall the function from assignment 15.
# Impute the missing values of a numeric vector with the vector's mean.
#
# Args:
#   x: A vector of any type.
# Returns:
#   `x` with every NA replaced by the mean of its observed values when `x` is
#   numeric and has at least one observed value; otherwise `x` unchanged.
#   An integer vector that contains NAs is returned as double, because the
#   mean is generally not a whole number.
mean_impute <- function(x)
{
  if (is.numeric(x))
  {
    missing <- is.na(x)
    # Skip vectors without NAs so their type is left untouched
    if (any(missing))
    {
      # Mean of the observed (non-missing) values
      mean_of_x <- mean(x, na.rm = TRUE)
      # An all-NA vector has no mean (NaN), so there is nothing to impute with
      if (!is.nan(mean_of_x))
      {
        # Replace the missing values by the mean; plain base-R assignment
        # avoids attaching a package from inside the function
        x[missing] <- mean_of_x
      }
    }
  }
  return(x)
}

# Extend `mean_impute()` to an entire data frame.
#
# Args:
#   d: A data frame (or list of columns).
# Returns:
#   `d` with `mean_impute()` applied to each column; `mean_impute()` itself
#   decides whether a column is numeric and needs imputing.
numeric_impute <- function(d)
{
  # seq_along() gives an empty sequence for a zero-column input, whereas
  # 1:length(d) would iterate over c(1, 0) and index out of bounds
  for (i in seq_along(d))
  {
    d[[i]] <- mean_impute(d[[i]])
  }
  return(d)
}
# Import a data frame to test the function: the "Adult Census Income with
# Missing Values" data set from Canvas.
library(tidyverse)
df <- read_csv("adult_census_missing.csv")

# Count the NAs in each column before imputing, to show that the data frame
# really does contain missing values.
colSums(is.na(df))
##            age      workclass         fnlwgt      education  education.num 
##             30             34              0             15              0 
## marital.status     occupation   relationship           race            sex 
##             26             35              0              0             24 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              8              0              0             15              0
# Apply the imputation and recount the NAs. In the counts printed below, only
# the numeric variables that had NAs, `age` and `capital.gain`, drop to zero;
# the non-numeric columns keep their missing values. The nice thing about this
# function is that it imputes every numeric variable in the data frame at
# once, without having to do so column by column.
df <- numeric_impute(df)
colSums(is.na(df))
##            age      workclass         fnlwgt      education  education.num 
##              0             34              0             15              0 
## marital.status     occupation   relationship           race            sex 
##             26             35              0              0             24 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0             15              0

  2. Write the following function. Give examples to test your function.

Hint: Use an if-statement to combine the function in Problem 1 and the function in this example

# Recall the function created from assignment 15.
# Impute the missing values of a non-numeric vector with the vector's mode.
#
# Args:
#   x: A vector of any type.
# Returns:
#   `x` with every NA replaced by its most frequent observed value when `x`
#   is not numeric and has at least one observed value; otherwise `x`
#   unchanged. The type of `x` (character, factor, logical, ...) is kept.
mode_impute <- function(x)
{
  if (!is.numeric(x))
  {
    missing <- is.na(x)
    # Nothing to do without NAs, and no mode exists when every value is NA
    if (any(missing) && !all(missing))
    {
      # Frequency of each observed value (table() leaves NA out by default)
      counts <- table(x)
      # Find the mode of x; which.max() returns the first maximum, so ties go
      # to the value that comes first in table() order
      mode_label <- names(counts)[which.max(counts)]
      # Take the mode from x itself, so the replacement has x's own type
      # instead of the character label produced by names()
      mode_of_x <- x[match(mode_label, as.character(x))]
      # Replace the missing by the mode
      x[missing] <- mode_of_x
    }
  }
  return(x)
}

# Extend `mode_impute()` to an entire data frame.
#
# Args:
#   d: A data frame (or list of columns).
# Returns:
#   `d` with `mode_impute()` applied to each column; `mode_impute()` itself
#   decides whether a column is non-numeric and needs imputing.
category_impute <- function(d)
{
  # seq_along() gives an empty sequence for a zero-column input, whereas
  # 1:length(d) would iterate over c(1, 0) and index out of bounds
  for (i in seq_along(d))
  {
    d[[i]] <- mode_impute(d[[i]])
  }
  return(d)
}
# Handle all missing values in a data frame by combining the two imputation
# functions: numeric columns go through `numeric_impute()` and the remaining
# columns through `category_impute()`.
#
# Args:
#   d: A data frame.
# Returns:
#   The imputed data frame.
remove_missing_values <- function(d)
{
  d <- numeric_impute(d)
  d <- category_impute(d)
  # Return explicitly: a function body that ends on an assignment hands its
  # value back invisibly, so a bare call would print nothing
  return(d)
}
# Reset the data frame by re-reading the CSV, which brings back the missing
# values that were imputed above.
df <- read_csv("adult_census_missing.csv")

# Count the NAs in each column before imputing, to show that the data frame
# contains missing values again.
colSums(is.na(df))
##            age      workclass         fnlwgt      education  education.num 
##             30             34              0             15              0 
## marital.status     occupation   relationship           race            sex 
##             26             35              0              0             24 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              8              0              0             15              0
# Impute every column, then recount the NAs; the counts printed below are all
# zero.
df <- remove_missing_values(df)
colSums(is.na(df))
##            age      workclass         fnlwgt      education  education.num 
##              0              0              0              0              0 
## marital.status     occupation   relationship           race            sex 
##              0              0              0              0              0 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0              0              0

  3. Write the following function. Give examples to test your function.

Hint: Similar function

# Draw a bar chart for every non-numeric variable of a data frame.
#
# Args:
#   d: A data frame.
# Returns:
#   NULL, invisibly. The function is called for its side effect of printing
#   one plot per non-numeric column.
plot_category <- function(d)
{
  col_names <- names(d)
  # seq_along() gives an empty sequence for a zero-column input, whereas
  # 1:length(d) would iterate over c(1, 0) and index out of bounds
  for (i in seq_along(d))
  {
    if (!is.numeric(d[[i]]))
    {
      # ggplot2 is called through its namespace so the function does not
      # attach a package as a side effect; print() is required because plots
      # are not auto-printed inside a loop
      print(ggplot2::ggplot(d, ggplot2::aes(x = d[[i]])) +
              ggplot2::geom_bar(fill = "seagreen2",
                                color = "black") +
              # Label the axis with the column name instead of "d[[i]]"
              ggplot2::labs(x = col_names[i]))
    }
  }
  invisible(NULL)
}

# Test the function on the imputed data frame: one bar chart is printed for
# each non-numeric column.
plot_category(df)


  4. Write the following function. Give examples to test your function.

Hint: Similar function

# Draw a stacked bar chart for every pair of non-numeric variables.
#
# Each pair of columns (i, j) with i < j is visited once; column i goes on the
# x axis and column j sets the fill colour.
#
# Args:
#   d: A data frame.
# Returns:
#   NULL, invisibly. The function is called for its side effect of printing
#   the plots.
plot_category2 <- function(d)
{
  l <- length(d)
  # With fewer than two columns there are no pairs; without this guard
  # 1:(l - 1) would count downwards and index past the last column
  if (l < 2)
  {
    return(invisible(NULL))
  }
  col_names <- names(d)
  for (i in seq_len(l - 1))
  {
    for (j in (i + 1):l)
    {
      # `&&` is the scalar, short-circuiting operator meant for `if`
      if (!is.numeric(d[[i]]) && !is.numeric(d[[j]]))
      {
        # ggplot2 is called through its namespace so the function does not
        # attach a package as a side effect
        print(ggplot2::ggplot(d, ggplot2::aes(x = d[[i]], fill = d[[j]])) +
                ggplot2::geom_bar() +
                ggplot2::labs(x = col_names[i], fill = col_names[j]))
      }
    }
  }
  invisible(NULL)
}

# Test the function on the imputed data frame: one bar chart is printed for
# each pair of non-numeric columns.
plot_category2(df)


  5. Write the following function. Give examples to test your function.

Hint: Combine this function, this function, and the function in Question 4. One way to combine them is to create a new function, quick_plot, and call these three functions within quick_plot.

# Draw a density plot of every numeric variable, split by every non-numeric
# variable. Defined first so it can be called by the combined plotting
# function.
#
# Every (numeric, non-numeric) pair is plotted regardless of column order.
# Requiring the numeric column to come first would silently skip pairs where
# the categorical column is positioned before the numeric one.
#
# Args:
#   d: A data frame.
# Returns:
#   NULL, invisibly. The function is called for its side effect of printing
#   the plots.
density_plot2 <- function(d)
{
  col_names <- names(d)
  # One flag per column; logical(0) for a zero-column input, so both loops
  # below are then skipped
  is_num <- vapply(d, is.numeric, logical(1))
  for (i in which(is_num))
  {
    for (j in which(!is_num))
    {
      # ggplot2 is called through its namespace so the function does not
      # attach a package as a side effect
      print(ggplot2::ggplot(d, ggplot2::aes(x = d[[i]], color = d[[j]])) +
              ggplot2::geom_density() +
              ggplot2::labs(x = col_names[i], color = col_names[j]))
    }
  }
  invisible(NULL)
}

# Draw a scatter plot for every pair of numeric variables. Defined first so it
# can be called by the combined plotting function.
#
# Each pair of columns (i, j) with i < j is visited once; column i goes on the
# x axis and column j on the y axis.
#
# Args:
#   d: A data frame.
# Returns:
#   NULL, invisibly. The function is called for its side effect of printing
#   the plots.
scatter_plot <- function(d)
{
  l <- length(d)
  # With fewer than two columns there are no pairs; without this guard
  # 1:(l - 1) would count downwards and index past the last column
  if (l < 2)
  {
    return(invisible(NULL))
  }
  col_names <- names(d)
  for (i in seq_len(l - 1))
  {
    for (j in (i + 1):l)
    {
      # `&&` is the scalar, short-circuiting operator meant for `if`
      if (is.numeric(d[[i]]) && is.numeric(d[[j]]))
      {
        # ggplot2 is called through its namespace so the function does not
        # attach a package as a side effect
        print(ggplot2::ggplot(d, ggplot2::aes(x = d[[i]], y = d[[j]])) +
                ggplot2::geom_point() +
                ggplot2::labs(x = col_names[i], y = col_names[j]))
      }
    }
  }
  invisible(NULL)
}

# Creating the `quick_plot` function: plot every pair of variables in a data
# frame by running the three pairwise helpers in turn (bar charts for two
# non-numeric variables, density plots for a numeric and a non-numeric
# variable, scatter plots for two numeric variables).
#
# Args:
#   d: A data frame.
# Returns:
#   NULL, invisibly. The function is called for its side effect of printing
#   the plots.
quick_plot <- function(d)
{
  plot_category2(d)
  density_plot2(d)
  scatter_plot(d)
  invisible(NULL)
}

# Backward-compatible alias for existing callers. Prefer `quick_plot()`: the
# name `quickplot` masks ggplot2's own `quickplot()` once ggplot2 is attached.
quickplot <- quick_plot

# Test the combined plotting function on the imputed data frame. The warnings
# printed below were emitted during this call.
quickplot(df)

## Warning: Groups with fewer than two data points have been dropped.
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf

## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf

## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf

## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf

## Warning: Groups with fewer than two data points have been dropped.
## no non-missing arguments to max; returning -Inf


Bonus Test:

# (Bonus): Test all functions on a new data set: the Titanic data set with
# missing values from Canvas.
# NOTE(review): the tibble printed below shows `Pclass` and `Fare` parsed as
# <chr>, so every function in this file treats them as non-numeric. Confirm
# that this is intended for `Fare`.
df <- read_csv("titanic_missing.csv")
head(df)
## # A tibble: 6 × 9
##   Survived Pclass Sex      Age SibSp Parch Fare    Cabin Embarked
##      <dbl> <chr>  <chr>  <dbl> <dbl> <dbl> <chr>   <chr> <chr>   
## 1        0 3      male      22     1     0 7.25    <NA>  S       
## 2        1 1      female    38     1     0 71.2833 C85   C       
## 3        1 3      female    26     0     0 7.925   <NA>  S       
## 4        1 1      female    35     1     0 53.1    C123  S       
## 5        0 3      male      35     0     0 8.05    <NA>  S       
## 6        0 3      male      NA     0     0 8.4583  <NA>  Q
# Check for missing values by counting the NAs in each column.
colSums(is.na(df))
## Survived   Pclass      Sex      Age    SibSp    Parch     Fare    Cabin 
##        0        0        0      177        0        0        0      687 
## Embarked 
##        2
# Use the function from question 1 to handle the missing values in the numeric
# variable `Age`. The result is stored in `df1` so that `df` keeps its missing
# values for the next test.
df1 <- numeric_impute(df)
colSums(is.na(df1))
## Survived   Pclass      Sex      Age    SibSp    Parch     Fare    Cabin 
##        0        0        0        0        0        0        0      687 
## Embarked 
##        2
# Use the function from question 2 to take care of all the missing values in
# the data frame, then recount the NAs to confirm that none are left.
df <- remove_missing_values(df)
colSums(is.na(df))
## Survived   Pclass      Sex      Age    SibSp    Parch     Fare    Cabin 
##        0        0        0        0        0        0        0        0 
## Embarked 
##        0
# Test the function created in question 3 on the new data: one bar chart for
# each non-numeric column.
plot_category(df)

# Test the function created in question 4 on the new data frame: one bar chart
# for each pair of non-numeric columns.
plot_category2(df)

# Test the function created in question 5 on the new data set. It calls
# plot_category2() itself, so the bar charts from the previous call are drawn
# again here.
quickplot(df)